{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "67c1e6b1",
   "metadata": {},
   "source": [
    "\n",
    "\n",
    "下面的例子将展示词向量标准工具包——gensim提供的词嵌入，并展示词嵌入如何表示词的相似度。\n",
    "<!-- https://nlp.stanford.edu/projects/glove/ -->"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "5c5a740a",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pprint\n",
    "\n",
    "from gensim.models import KeyedVectors\n",
    "\n",
    "# 从GloVe官网下载GloVe向量，此处使用的是glove.6B.zip\n",
    "# 解压缩zip文件并将以下路径改为解压后对应文件的路径\n",
    "model = KeyedVectors.load_word2vec_format('D:/py/@Hands-on-NLP-main/glove.6B.100d.txt', binary=False, no_header=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "01a2e4a5",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[('movie', 0.9055121541023254),\n",
      " ('films', 0.8914433717727661),\n",
      " ('directed', 0.8124364018440247),\n",
      " ('documentary', 0.8075793981552124),\n",
      " ('drama', 0.7929168939590454),\n",
      " ('movies', 0.7889865040779114),\n",
      " ('comedy', 0.7842751741409302),\n",
      " ('starring', 0.7573286294937134),\n",
      " ('cinema', 0.7419455647468567),\n",
      " ('hollywood', 0.7307389378547668)]\n",
      "[('vehicle', 0.8630837798118591),\n",
      " ('truck', 0.8597878813743591),\n",
      " ('cars', 0.837166965007782),\n",
      " ('driver', 0.8185911178588867),\n",
      " ('driving', 0.7812635898590088),\n",
      " ('motorcycle', 0.7553157210350037),\n",
      " ('vehicles', 0.7462256550788879),\n",
      " ('parked', 0.74594646692276),\n",
      " ('bus', 0.7372707724571228),\n",
      " ('taxi', 0.7155268788337708)]\n"
     ]
    }
   ],
   "source": [
    "# 使用most_similar()找到词表中距离给定词最近（最相似）的n个词\n",
    "pprint.pprint(model.most_similar('film'))\n",
    "pprint.pprint(model.most_similar('car'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "8b62f7ad",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "japanese\n",
      "panda\n",
      "longest\n",
      "terrible\n",
      "queen\n"
     ]
    }
   ],
   "source": [
    "# 利用GloVe展示一个类比的例子\n",
    "def analogy(x1, x2, y1):\n",
    "    # 寻找top-N最相似的词。\n",
    "    result = model.most_similar(positive=[y1, x2], negative=[x1])\n",
    "    return result[0][0]\n",
    "\n",
    "print(analogy('china', 'chinese', 'japan'))\n",
    "print(analogy('australia', 'koala', 'china'))\n",
    "print(analogy('tall', 'tallest', 'long'))\n",
    "print(analogy('good', 'fantastic', 'bad'))\n",
    "print(analogy('man', 'woman', 'king'))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0c308cee",
   "metadata": {},
   "source": [
    "下面将展示word2vec的代码，包括文本预处理、skipgram算法的实现、以及使用PyTorch进行优化。这里使用《小王子》这本书作为训练语料。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "590fc408",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(76044, 2) [[  4  16]\n",
      " [  4  19]\n",
      " [ 16   4]\n",
      " ...\n",
      " [130   3]\n",
      " [  3  86]\n",
      " [  3 130]]\n"
     ]
    }
   ],
   "source": [
    "# 安装NLTK，使用如下代码下载punkt组件\n",
    "#import nltk\n",
    "#nltk.download('punkt')\n",
    "\n",
    "from nltk.tokenize import sent_tokenize, word_tokenize\n",
    "from collections import defaultdict\n",
    "\n",
    "# 使用类管理数据对象，包括文本读取、文本预处理等\n",
    "class TheLittlePrinceDataset:\n",
    "    def __init__(self, tokenize=True):\n",
    "        # 利用NLTK函数进行分句和分词\n",
    "        text = open('the little prince.txt', 'r', encoding='utf-8').read()\n",
    "        if tokenize:\n",
    "            self.sentences = sent_tokenize(text.lower())\n",
    "            self.tokens = [word_tokenize(sent) for sent in self.sentences]\n",
    "        else:\n",
    "            self.text = text\n",
    "\n",
    "    def build_vocab(self, min_freq=1):\n",
    "        # 统计词频\n",
    "        frequency = defaultdict(int)\n",
    "        for sentence in self.tokens:\n",
    "            for token in sentence:\n",
    "                frequency[token] += 1\n",
    "        self.frequency = frequency\n",
    "\n",
    "        # 加入<unk>处理未登录词，加入<pad>用于对齐变长输入进而加速\n",
    "        self.token2id = {'<unk>': 1, '<pad>': 0}\n",
    "        self.id2token = {1: '<unk>', 0: '<pad>'}\n",
    "        for token, freq in sorted(frequency.items(), key=lambda x: -x[1]):\n",
    "            # 丢弃低频词\n",
    "            if freq > min_freq:\n",
    "                self.token2id[token] = len(self.token2id)\n",
    "                self.id2token[len(self.id2token)] = token\n",
    "            else:\n",
    "                break\n",
    "\n",
    "    def get_word_distribution(self):\n",
    "        distribution = np.zeros(vocab_size)\n",
    "        for token, freq in self.frequency.items():\n",
    "            if token in dataset.token2id:\n",
    "                distribution[dataset.token2id[token]] = freq\n",
    "            else:\n",
    "                # 不在词表中的词按<unk>计算\n",
    "                distribution[1] += freq\n",
    "        distribution /= distribution.sum()\n",
    "        return distribution\n",
    "\n",
    "    # 将分词结果转化为索引表示\n",
    "    def convert_tokens_to_ids(self, drop_single_word=True):\n",
    "        self.token_ids = []\n",
    "        for sentence in self.tokens:\n",
    "            token_ids = [self.token2id.get(token, 1) for token in sentence]\n",
    "            # 忽略只有一个token的序列，无法计算loss\n",
    "            if len(token_ids) == 1 and drop_single_word:\n",
    "                continue\n",
    "            self.token_ids.append(token_ids)\n",
    "        \n",
    "        return self.token_ids\n",
    "\n",
    "dataset = TheLittlePrinceDataset()\n",
    "dataset.build_vocab(min_freq=1)\n",
    "sentences = dataset.convert_tokens_to_ids()\n",
    "# 遍历所有的中心词-上下文词对\n",
    "window_size = 2\n",
    "data = []\n",
    "\n",
    "for sentence in sentences:\n",
    "    for i in range(len(sentence)):\n",
    "        for j in range(i-window_size, i+window_size+1):\n",
    "            if j == i or j < 0 or j >= len(sentence):\n",
    "                continue\n",
    "            center_word = sentence[i]\n",
    "            context_word = sentence[j]\n",
    "            data.append([center_word, context_word])\n",
    "\n",
    "# 需要提前安装numpy\n",
    "import numpy as np\n",
    "data = np.array(data)\n",
    "print(data.shape, data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "30903b3d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 需要提前安装PyTorch\n",
    "import torch\n",
    "from torch import nn\n",
    "import torch.nn.functional as F\n",
    "\n",
    "# 实现skipgram算法，使用对比学习计算损失\n",
    "class SkipGramNCE(nn.Module):\n",
    "    def __init__(self, vocab_size, embed_size, distribution,\\\n",
    "                 neg_samples=20):\n",
    "        super(SkipGramNCE, self).__init__()\n",
    "        print(f'vocab_size = {vocab_size}, embed_size = {embed_size}, '+\\\n",
    "              f'neg_samples = {neg_samples}')\n",
    "        self.input_embeddings = nn.Embedding(vocab_size, embed_size)\n",
    "        self.output_embeddings = nn.Embedding(vocab_size, embed_size)\n",
    "        distribution = np.power(distribution, 0.75)\n",
    "        distribution /= distribution.sum()\n",
    "        self.distribution = torch.tensor(distribution)\n",
    "        self.neg_samples = neg_samples\n",
    "        \n",
    "    def forward(self, input_ids, labels):\n",
    "        i_embed = self.input_embeddings(input_ids)\n",
    "        o_embed = self.output_embeddings(labels)\n",
    "        batch_size = i_embed.size(0)\n",
    "        n_words = torch.multinomial(self.distribution, batch_size * \\\n",
    "            self.neg_samples, replacement=True).view(batch_size, -1)\n",
    "        n_embed = self.output_embeddings(n_words)\n",
    "        pos_term = F.logsigmoid(torch.sum(i_embed * o_embed, dim=1))\n",
    "        # 负采样，用于对比学习\n",
    "        neg_term = F.logsigmoid(- torch.bmm(n_embed, \\\n",
    "            i_embed.unsqueeze(2)).squeeze())\n",
    "        neg_term = torch.sum(neg_term, dim=1)\n",
    "        loss = - torch.mean(pos_term + neg_term)\n",
    "        return loss"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "1d9da6c8",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[0.00000000e+00 5.43983724e-02 5.34295679e-02 ... 9.68804495e-05\n",
      " 9.68804495e-05 9.68804495e-05]\n",
      "vocab_size = 1078, embed_size = 128, neg_samples = 20\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "epoch-99, loss=3.0073: 100%|█| 100/100 [04:27<00:00,  2.68s/\n"
     ]
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAjMAAAGwCAYAAABcnuQpAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8fJSN1AAAACXBIWXMAAA9hAAAPYQGoP6dpAABNfUlEQVR4nO3deVxU9f4/8NcwwwzbMIgKA7KIggsqam5p5pa7mUurWumv22K55i271e0b93aTrt3Myq73tnzNbpp9LTNvloqlqJlKCIpoioqCyqIIDOsAM5/fH+McGDaRGTgMvp6Pxzxqzjkz8+a4zMvPqhBCCBARERE5KRe5CyAiIiKyB8MMEREROTWGGSIiInJqDDNERETk1BhmiIiIyKkxzBAREZFTY5ghIiIip6aSu4DmZjabceXKFWi1WigUCrnLISIiokYQQqCwsBCBgYFwcWm47aXNh5krV64gODhY7jKIiIioCTIyMhAUFNTgNW0+zGi1WgCWm+Ht7S1zNURERNQYBoMBwcHB0vd4Q9p8mLF2LXl7ezPMEBEROZnGDBHhAGAiIiJyagwzRERE5NQYZoiIiMipMcwQERGRU2OYISIiIqfGMENEREROjWGGiIiInBrDDBERETk1hhkiIiJyagwzRERE5NQYZoiIiMipMcwQERGRU2vzG002lyJjJfJLyuHuqkR7L43c5RAREd222DLTRJ/9kobhf9+Dt3eelrsUIiKi2xrDTBO5Ki23rtxklrkSIiKi2xvDTBOpVZZbV2ESMldCRER0e2OYaSKpZabSJHMlREREtzeGmSZSK9kyQ0RE1BowzDRRVTcTx8wQERHJiWGmiazdTMZKhhkiIiI5Mcw0katSAYAtM0RERHJjmGkidjMRERG1DgwzTaSWZjMxzBAREcmJYaaJXLnODBERUavAMNNEbJkhIiJqHRhmmojbGRAREbUODDNNpFZxNhMREVFrwDDTRNaWmQp2MxEREcmKYaaJrFOz2c1EREQkL4aZJnKttjeTEJzRREREJJdWE2ZiYmKgUCiwdOlS6ZgQAtHR0QgMDIS7uztGjRqFlJQU+YqsxhpmAE7PJiIiklOrCDPx8fH46KOPEBUVZXN85cqVWLVqFdasWYP4+Hjo9XqMGzcOhYWFMlVaRaOqHmbY1URERCQX2cNMUVER5syZg48//hjt2rWTjgshsHr1arz66quYOXMmevfujfXr16OkpAQbN26s9/2MRiMMBoPNozlUb5nhWjNERETykT3MLFiwAFOmTMHYsWNtjqelpSErKwvjx4+Xjmk0GowcORIHDx6s9/1iYmKg0+mkR3BwcLPUrXRRwMUyO5stM0RERDKSNcxs2rQJR48eRUxMTK1zWVlZAAB/f3+b4/7+/tK5urz88ssoKCiQHhkZGY4tuhrOaCIiIpKfSq4PzsjIwJIlS7Br1y64ubnVe51CobB5LoSodaw6jUYDjUbjsDob4qp0QVmFmd1MREREMpKtZSYhIQE5OTkYMGAAVCoVVCoV4uLi8P7770OlUkktMjVbYXJycmq11shFreRmk0RERHKTLczcc889SE5ORlJSkvQYOHAg5syZg6SkJHTp0gV6vR6xsbHSa8rLyxEXF4dhw4bJVbYNtbRzNltmiIiI5CJbN5NWq0Xv3r1tjnl6eqJ9+/bS8aVLl2LFihWIiIhAREQEVqxYAQ8PD8yePVuOkmuxzmgyspuJiIhINrKFmcZYvnw5SktL8dxzzyEvLw9DhgzBrl27oNVq5S4NAOCq5GaTREREcmtVYWbv3r02zxUKBaKjoxEdHS1LPTejVikBMMwQERHJSfZ1ZpyZ+kbLDGczERERyYdhxg5Vm00yzBAREcmFYcYOVYvmcWo2ERGRXBhm7GBtmWE3ExERkXwYZuzAbiYiIiL5MczYQa3i1GwiIiK5MczYQc1uJiIiItkxzNhBGjPDlhkiIiLZMMzYwdW6N1MlZzMRERHJhWHG
DmoOACYiIpIdw4wdqtaZYZghIiKSC8OMHVy5nQEREZHsGGbsoFZyo0kiIiK5MczYwVXFlhkiIiK5MczYgQOAiYiI5McwYwfrAOAKbjRJREQkG4YZO1gXzTOym4mIiEg2DDN24EaTRERE8mOYsUNVNxPDDBERkVwYZuyg5jozREREsmOYsQO7mYiIiOTHMGOHql2zOZuJiIhILgwzdpD2Zqo0yVwJERHR7Ythxg5V3UxsmSEiIpILw4wduAIwERGR/Bhm7FDVzcQwQ0REJBeGGTu4Wqdms2WGiIhINgwzduDUbCIiIvkxzNhBw24mIiIi2THM2MHaMmMWgMnMGU1ERERyYJixg6uq6vaxq4mIiEgesoaZtWvXIioqCt7e3vD29sbQoUPx448/SufnzZsHhUJh87jzzjtlrNiWdWo2wEHAREREclHJ+eFBQUF46623EB4eDgBYv349pk2bhsTERPTq1QsAMHHiRKxbt056jVqtlqXWulhnMwEcN0NERCQXWcPM1KlTbZ6/+eabWLt2LQ4dOiSFGY1GA71eL0d5N6VQKOCqVKDCJNjNREREJJNWM2bGZDJh06ZNKC4uxtChQ6Xje/fuhZ+fH7p164annnoKOTk5Db6P0WiEwWCweTQnaRXgSg4AJiIikoPsYSY5ORleXl7QaDSYP38+vv32W0RGRgIAJk2ahA0bNuDnn3/GO++8g/j4eIwZMwZGo7He94uJiYFOp5MewcHBzVq/dRBwuYmbTRIREclBIYSQtUmhvLwc6enpyM/PxzfffINPPvkEcXFxUqCpLjMzE6Ghodi0aRNmzpxZ5/sZjUabsGMwGBAcHIyCggJ4e3s7vP5Bb+7G1UIjflh8NyIDHf/+REREtyODwQCdTteo729Zx8wAlgG91gHAAwcORHx8PN577z38+9//rnVtQEAAQkNDkZqaWu/7aTQaaDSaZqu3Jm42SUREJC/Zu5lqEkLU242Um5uLjIwMBAQEtHBV9ZM2m2SYISIikoWsLTOvvPIKJk2ahODgYBQWFmLTpk3Yu3cvduzYgaKiIkRHR+P+++9HQEAALly4gFdeeQUdOnTAjBkz5CzbhnV6dgWnZhMREclC1jCTnZ2Nxx57DJmZmdDpdIiKisKOHTswbtw4lJaWIjk5GZ9//jny8/MREBCA0aNH46uvvoJWq5WzbBvWLQ3YMkNERCQPWcPMp59+Wu85d3d37Ny5swWraRo1N5skIiKSVasbM+NsXKUBwFxnhoiISA4MM3bibCYiIiJ5MczYid1MRERE8mKYsZN1NhMHABMREcmDYcZOruxmIiIikhXDjJ3YzURERCQvhhk7cQAwERGRvBhm7FS1aB6nZhMREcmBYcZO7GYiIiKSF8OMnTgAmIiISF4MM3ZSWzeaZJghIiKSBcOMnazdTAwzRERE8mCYsZO1m8nIMTNERESyYJixEzeaJCIikhfDjJ1crd1MbJkhIiKSBcOMnTTSOjMMM0RERHJgmLGTq4qzmYiIiOTEMGMnaQVgdjMRERHJgmHGTmp2MxEREcmKYcZOrlxnhoiISFYMM3aSds2u5NRsIiIiOTDM2EnaaJItM0RERLJgmLETBwATERHJi2HGTq7caJKIiEhWDDN20rCbiYiISFYMM3aS9mZiNxMREZEsGGbsxI0miYiI5MUwY6fqs5mEYKAhIiJqaQwzdrK2zABsnSEiIpIDw4yd1DZhhuNmiIiIWhrDjJ2s3UwA15ohIiKSg6xhZu3atYiKioK3tze8vb0xdOhQ/Pjjj9J5IQSio6MRGBgId3d3jBo1CikpKTJWXJvSRQEXy1IzbJkhIiKSgaxhJigoCG+99RZ+++03/PbbbxgzZgymTZsmBZaVK1di1apVWLNmDeLj46HX6zFu3DgUFhbKWXYtrtw5m4iISDayhpmpU6di8uTJ6NatG7p164Y333wTXl5eOHToEIQQWL16NV599VXMnDkTvXv3xvr161FSUoKNGzfKWXYtak7P
JiIikk2rGTNjMpmwadMmFBcXY+jQoUhLS0NWVhbGjx8vXaPRaDBy5EgcPHiw3vcxGo0wGAw2j+YmTc/mmBkiIqIWJ3uYSU5OhpeXFzQaDebPn49vv/0WkZGRyMrKAgD4+/vbXO/v7y+dq0tMTAx0Op30CA4Obtb6geoL5zHMEBERtTTZw0z37t2RlJSEQ4cO4dlnn8XcuXNx8uRJ6bxCobC5XghR61h1L7/8MgoKCqRHRkZGs9Vu5aqy1MMxM0RERC1PJXcBarUa4eHhAICBAwciPj4e7733Hl566SUAQFZWFgICAqTrc3JyarXWVKfRaKDRaJq36BqsY2bYzURERNTyZG+ZqUkIAaPRiLCwMOj1esTGxkrnysvLERcXh2HDhslYYW3sZiIiIpKPrC0zr7zyCiZNmoTg4GAUFhZi06ZN2Lt3L3bs2AGFQoGlS5dixYoViIiIQEREBFasWAEPDw/Mnj1bzrJrsQ4AZpghIiJqebKGmezsbDz22GPIzMyETqdDVFQUduzYgXHjxgEAli9fjtLSUjz33HPIy8vDkCFDsGvXLmi1WjnLroXdTERERPKRNcx8+umnDZ5XKBSIjo5GdHR0yxTURFWL5nGdGSIiopbW6sbMOCNXazcTW2aIiIhaHMOMA6i5nQEREZFsGGYcQH1jnRkOACYiImp5DDMO4MoBwERERLJhmHEAdjMRERHJh2HGAaoGAHM2ExERUUtjmHEANVcAJiIikg3DjAO4KrnRJBERkVwYZhzAup0BBwATERG1PIYZB+BGk0RERPJhmHEATs0mIiKSD8OMA2i4azYREZFsGGYcoKqbiVOziYiIWhrDjAO4ctE8IiIi2TDMOABnMxEREcmHYcYBrOvMcMwMERFRy2OYcQCuAExERCQfhhkHYDcTERGRfBhmHKBqADBnMxEREbU0hhkH4ArARERE8mGYcQB2MxEREcmHYcYBOACYiIhIPgwzDuCq4tRsIiIiuTDMOIC1ZcbIbiYiIqIWxzDjABwATEREJB+GGQdQq7jRJBERkVwYZhzA2jJjMguYzAw0RERELYlhxgGsLTMAu5qIiIhaGsOMA1g3mgSAcoYZIiKiFsUw4wCuLlW3kQvnERERtSyGGQdwcVFIrTPsZiIiImpZsoaZmJgYDBo0CFqtFn5+fpg+fTpOnz5tc828efOgUChsHnfeeadMFddPmp5dyQHARERELUnWMBMXF4cFCxbg0KFDiI2NRWVlJcaPH4/i4mKb6yZOnIjMzEzp8cMPP8hUcf2qds42yVwJERHR7UUl54fv2LHD5vm6devg5+eHhIQEjBgxQjqu0Wig1+tburxbUrXZJFtmiIiIWlKrGjNTUFAAAPD19bU5vnfvXvj5+aFbt2546qmnkJOTU+97GI1GGAwGm0dL4GaTRERE8mg1YUYIgWXLlmH48OHo3bu3dHzSpEnYsGEDfv75Z7zzzjuIj4/HmDFjYDQa63yfmJgY6HQ66REcHNwi9XMAMBERkTwUQohW0S+yYMECbN++HQcOHEBQUFC912VmZiI0NBSbNm3CzJkza503Go02QcdgMCA4OBgFBQXw9vZultoBYPy7cTiTXYSNTw7BsPAOzfY5REREtwODwQCdTteo729Zx8xYLVq0CNu2bcO+ffsaDDIAEBAQgNDQUKSmptZ5XqPRQKPRNEeZDaoaAMyWGSIiopYka5gRQmDRokX49ttvsXfvXoSFhd30Nbm5ucjIyEBAQEALVNh4VTtnt4qGLiIiotuGrGNmFixYgC+++AIbN26EVqtFVlYWsrKyUFpaCgAoKirCCy+8gF9//RUXLlzA3r17MXXqVHTo0AEzZsyQs/RaqmYzsWWGiIioJcnaMrN27VoAwKhRo2yOr1u3DvPmzYNSqURycjI+//xz5OfnIyAgAKNHj8ZXX30FrVYrQ8X142wmIiIiecjezdQQd3d37Ny5s4WqsY91NhPHzBAREbWsVjM129lJA4DZzURERNSiGGYc
xDpmht1MRERELYthxkE4ZoaIiEgeDDMOwm4mIiIieTDMOIg0NZvrzBAREbUohhkHcWU3ExERkSyaFGbWr1+P7du3S8+XL18OHx8fDBs2DBcvXnRYcc7EVXVjaja7mYiIiFpUk8LMihUr4O7uDgD49ddfsWbNGqxcuRIdOnTA888/79ACnYWGLTNERESyaNKieRkZGQgPDwcAbN26FQ888ACefvpp3HXXXbVW871dsJuJiIhIHk1qmfHy8kJubi4AYNeuXRg7diwAwM3NTdpX6XbjemMAsLFaN9PVQiMe/eQwvj9+Ra6yiIiI2rwmtcyMGzcOTz75JPr3748zZ85gypQpAICUlBR07tzZkfU5DXUdu2ZvOXoJB85ew7UiI+6NCpSrNCIiojatSS0zH374IYYOHYqrV6/im2++Qfv27QEACQkJmDVrlkMLdBbWlpmKai0zxy7lAwDOZBeipLxSjrKIiIjavCa1zPj4+GDNmjW1jv/lL3+xuyBnpa5jo8mk9HwAgFkAKVcMGNTZV47SiIiI2rQmtczs2LEDBw4ckJ5/+OGH6NevH2bPno28vDyHFedMau7NlGMow5WCMun8sYx8OcoiIiJq85oUZl588UUYDAYAQHJyMv74xz9i8uTJOH/+PJYtW+bQAp1Fze0MkmqEl5rPiYiIyDGa1M2UlpaGyMhIAMA333yDe++9FytWrMDRo0cxefJkhxboLGpOzbaGlxBfD6RfL5HGzxAREZFjNallRq1Wo6SkBACwe/dujB8/HgDg6+srtdjcbqr2ZrINM3OGhAAAMq6XIrfIKEttREREbVmTwszw4cOxbNkyvPHGGzhy5Ig0NfvMmTMICgpyaIHOQpqaXSlgNgscv1QAALg7oiO6dvQEAOkYEREROU6TwsyaNWugUqnw9ddfY+3atejUqRMA4Mcff8TEiRMdWqCzqN7NdO5qEYqMlXB3VaKbvxf6BvsA4LgZIiKi5tCkMTMhISH4/vvvax1/99137S7IWbnemJptrDQj8UZo6ROkg0rpgn7BPthy9DLHzRARETWDJoUZADCZTNi6dStOnToFhUKBnj17Ytq0aVAqlY6sz2lUn5ptnYbd70aLTFSQ5b/HMvIhhIBCoZChQiIiorapSWHm7NmzmDx5Mi5fvozu3btDCIEzZ84gODgY27dvR9euXR1dZ6unrtbNlFQjzPQM0MJVqUBeSQUyrpcipL2HTFUSERG1PU0aM7N48WJ07doVGRkZOHr0KBITE5Geno6wsDAsXrzY0TU6BeuYmeJyE37PKgRQFWY0KiUiA7wBAEnsaiIiInKoJoWZuLg4rFy5Er6+Vcvzt2/fHm+99Rbi4uIcVpwzkaZmV5phMgt01GoQoHOTzlsHAXMlYCIiIsdqUpjRaDQoLCysdbyoqAhqtdruopyRtWXGql+wj83YmL7Vxs0QERGR4zQpzNx77714+umncfjwYQghIITAoUOHMH/+fNx3332OrtEpqOsIM9VZW2ZOXCmQVgkmIiIi+zUpzLz//vvo2rUrhg4dCjc3N7i5uWHYsGEIDw/H6tWrHVyic7B2M1nVDDNdOnhCq1GhrMKMM9m1W7WIiIioaZo0m8nHxwffffcdzp49i1OnTkEIgcjISISHhzu6PqdhXWcGABQKICpIZ3PexUWBqGAdfjmbi2MZBegVqKv5FkRERNQEjQ4zN9sNe+/evdL/r1q1qskFOSuliwIKBSAEEN7RC1o311rX9A3yuRFm8jH7xp5NREREZJ9Gh5nExMRGXXe7LginUCigVrrAWGmWxsfUJM1o4vRsIiIih2l0mNmzZ09z1tEmWMNMzfEyVtbjZ7ILUVJeCQ91kxdgJiIiohuaNADYUWJiYjBo0CBotVr4+flh+vTpOH36tM01QghER0cjMDAQ7u7uGDVqFFJSUmSquGHe7paupYGd29V53t/bDf7eGpgFcPKKoSVLIyIiarNkDTNxcXFYsGABDh06hNjYWFRWVmL8+PEo
Li6Wrlm5ciVWrVqFNWvWID4+Hnq9HuPGjatznRu5vftwP6x6qC966L3rvSbQxx0AkFtc3lJlERERtWmy9nPs2LHD5vm6devg5+eHhIQEjBgxAkIIrF69Gq+++ipmzpwJAFi/fj38/f2xceNGPPPMM3KUXa/BYb4YHObb4DVeGsstLymvbImSiIiI2jxZW2ZqKigoAABpm4S0tDRkZWVh/Pjx0jUajQYjR47EwYMH63wPo9EIg8Fg82hNPNSWXcWLjSaZKyEiImobWk2YEUJg2bJlGD58OHr37g0AyMrKAgD4+/vbXOvv7y+dqykmJgY6nU56BAcHN2/ht8hTzZYZIiIiR2o1YWbhwoU4fvw4vvzyy1rnak73FkLUOwX85ZdfRkFBgfTIyMholnqbykNjaZkpYssMERGRQ7SKucGLFi3Ctm3bsG/fPgQFBUnH9Xo9AEsLTUBAgHQ8JyenVmuNlUajgUajad6C7SC1zBjZMkNEROQIsrbMCCGwcOFCbNmyBT///DPCwsJszoeFhUGv1yM2NlY6Vl5ejri4OAwbNqyly3UIzxsDgIvL2TJDRETkCLK2zCxYsAAbN27Ed999B61WK42D0el0cHd3h0KhwNKlS7FixQpEREQgIiICK1asgIeHB2bPni1n6U1mHQDMMTNERESOIWuYWbt2LQBg1KhRNsfXrVuHefPmAQCWL1+O0tJSPPfcc8jLy8OQIUOwa9cuaLXaFq7WMaSWGY6ZISIicghZw4wQ4qbXKBQKREdHIzo6uvkLagFsmSEiInKsVjOb6XZhHQBczAHAREREDsEw08KsU7M5AJiIiMgxGGZamLSdAVtmiIiIHIJhpoV5qDk1m4iIyJEYZlqYp4YDgImIiByJYaaFWVtmKkwCxkq2zhAREdmLYaaFed6Ymg0AJVxrhoiIyG4MMy1MpXSBRmW57cXsaiIiIrIbw4wMrKsAl3AQMBERkd0YZmRgXQWYC+cRERHZj2FGBlWrALNlhoiIyF4MMzKoWgWYLTNERET2YpiRgbVlhmvNEBER2Y9hRgbWhfPYzURERGQ/hhkZsGWGiIjIcRhmZGAdM1PElhkiIiK7MczIQGqZ4dRsIiIiuzHMyIA7ZxMRETkOw4wMuHM2ERGR4zDMyMC6nQFnMxEREdmPYUYG1u0M2DJDRERkP4YZGVRtZ8AwQ0REZC+GGRlUbWfAbiYiIiJ7MczIwEvDqdlERESOwjAjA07NJiIichyGGRlwajYREZHjMMzIwNoyU2ESMFaydYaIiMgeDDMysE7NBoASrjVDRERkF4YZGbgqXaBWWW59MbuaiIiI7MIwIxNpRhMHARMREdmFYUYm1q4mLpxHRERkH1nDzL59+zB16lQEBgZCoVBg69atNufnzZsHhUJh87jzzjvlKdbBqlYBZssMERGRPWQNM8XFxejbty/WrFlT7zUTJ05EZmam9Pjhhx9asMLmU7UKMFtmiIiI7KGS88MnTZqESZMmNXiNRqOBXq9voYpajrVlhmvNEBER2afVj5nZu3cv/Pz80K1bNzz11FPIyclp8Hqj0QiDwWDzaI2sC+exm4mIiMg+rTrMTJo0CRs2bMDPP/+Md955B/Hx8RgzZgyMRmO9r4mJiYFOp5MewcHBLVhx47FlhoiIyDFk7Wa6mYcfflj6/969e2PgwIEIDQ3F9u3bMXPmzDpf8/LLL2PZsmXSc4PB0CoDjQdbZoiIiByiVYeZmgICAhAaGorU1NR6r9FoNNBoNC1YVdNUzWZiywwREZE9WnU3U025ubnIyMhAQECA3KXYjTtnExEROYasLTNFRUU4e/as9DwtLQ1JSUnw9fWFr68voqOjcf/99yMgIAAXLlzAK6+8gg4dOmDGjBkyVu0Y3DmbiIjIMWQNM7/99htGjx4tPbeOdZk7dy7Wrl2L5ORkfP7558jPz0dAQABGjx6Nr776ClqtVq6SHcZTw0XziIiIHEHWMDNq1CgIIeo9v3PnzhaspmVZtzNg
ywwREZF9nGrMTFvCAcBERESOwTAjk6rtDNjNREREZA+GGZlIi+axZYaIiMguDDMykQYAs2WGiIjILgwzMuHUbCIiIsdgmJGJddG8CpOAsZKtM0RERE3FMCMT69RsACjhWjNERERNxjAjE1elC9Qqy+0vZlcTERFRkzHMyMjrxiDgEg4CJiIiajKGGRlZu5q4cB4REVHTMczISFprhi0zRERETcYwIyPrKsBFbJkhIiJqMoYZGVW1zDDMEBERNRXDjIyqxsywm4mIiKipGGZkVDWbiS0zRERETcUwIyNp52y2zBARETUZw4yMrGNmbjY1WwiBj/adww/JmS1RFhERkVNRyV3A7cy6P9PNds7el3oNK374HV4aFSb11kOhULREeURERE6BLTMyauzO2V8eTgdgmcJdUFrR7HURERE5E4YZGXlqrN1M9bfM5BjKEHsqW3qeWVDW7HURERE5E4YZGVmnZjfUMrM54RJMZiE9z2KYISIissEwI6ObDQA2mwW+PGLpYtLc2GGbLTNERES2GGZkJE3NrmcA8P6z13AprxTebipM7RsIAMgsKG2x+oiIiJwBZzPJSNrOoJ6WGevA35l3BKGjVgOALTNEREQ1sWVGRtIA4DpaZqoP/J01OAR6bzcAHDNDRERUE1tmZNTQ1GzrwN8Boe3QXa9FbpERALuZiIiIamLLjIysi+ZVmASMlVWtM9UH/s4eHAIA0OssLTOZBWUQQoCIiIgsGGZkZJ2aDQAl1daaqT7wd0pUAICqMFNSbkLhTbY/ICIiup0wzMjIVekC9Y0p18XVupr+Lz4DgGXgr5urJfB4qFXQubsC4LgZIiKi6hhmZOYpLZxnaZkpqzBhz+kcAMCM/p1srg2o1tVEREREFgwzMqva0sDSMvPruVyUlJug93ZDVJDO5lprV1MWBwETERFJZA0z+/btw9SpUxEYGAiFQoGtW7fanBdCIDo6GoGBgXB3d8eoUaOQkpIiT7HNRFpr5kbLzK6TlunYYyP9au2OzZYZIiKi2mQNM8XFxejbty/WrFlT5/mVK1di1apVWLNmDeLj46HX6zFu3DgUFha2cKXNx7oKcJGxEmazwO4ba8uMi9TXulbv7Q6AY2aIiIiqk3WdmUmTJmHSpEl1nhNCYPXq1Xj11Vcxc+ZMAMD69evh7++PjRs34plnnmnJUptNVctMJY5dysfVQiO0GhWGdmlf69oAH7bMEBER1dRqx8ykpaUhKysL48ePl45pNBqMHDkSBw8erPd1RqMRBoPB5tGaWadnFxtNUhfTyO4dpVlO1QXouAowERFRTa02zGRlZQEA/P39bY77+/tL5+oSExMDnU4nPYKDg5u1Tnt5aapaZmJPWruY/Ou8tmrMDAcAExERWbXaMGNVcxCsEKLWsepefvllFBQUSI+MjIzmLtEu1jEzKVcMOJtTBJWLAqO6+9V5rV5nGTNjKKuUZj8RERHd7lptmNHrLQNga7bC5OTk1GqtqU6j0cDb29vm0ZpZx8zsSrG0ytzZpb20OF5NXhoVtDdacrIM7GoiIiICWnGYCQsLg16vR2xsrHSsvLwccXFxGDZsmIyVOZZ1f6bSCsvU7Pq6mKz0HDdDRERkQ9bZTEVFRTh79qz0PC0tDUlJSfD19UVISAiWLl2KFStWICIiAhEREVixYgU8PDwwe/ZsGat2LOvO2VaNCTOpOUWc0URERHSDrGHmt99+w+jRo6Xny5YtAwDMnTsXn332GZYvX47S0lI899xzyMvLw5AhQ7Br1y5otVq5SnY46wrAANC7kzcCfdwbvD6AqwATERHZkDXMjBo1CkKIes8rFApER0cjOjq65YpqYdV3zh7Xs/ZCeTVZBwGzZYaIiMii1Y6ZuV1YBwADN+9iArjWDBERUU0MMzKzDugNbe+BngE37z7Tc38mIiIiG7J2MxHQu5MO78/qjx56bYPr51hx4TwiIiJbDDOtwH19Axt9bcCNzSbzSipQVmGCm6vyJq8gIiJq
29jN5GS83VVwvxFgOG6GiIiIYcbpKBSKal1NDDNEREQMM05IWgXYwHEzREREDDNOqL4ZTblFRuQWGeUoiYiISDYMM06orrVmcouMmLB6Hya9tx9lN/Z5IiIiuh0wzDihgDpWAV69OxXXisqRU2hEypUCuUojIiJqcQwzTqhmy8zZnEJsPJIunU9Mz5ejLCIiIlkwzDihmmNmYn74HSazgFpl+eVkmCEiotsJw4wTsnYzXSsyYu/pHPz0ew5ULgq8PjUSAJCUkS9jdURERC2LYcYJtfNwlVphXt6SDACYMyQE0/t1gosCuJxfimwD16AhIqLbA8OME6q5cJ7WTYUlY7vBU6NCN3/LZpXsaiIiotsFw4yT0nu7Sf+/aEw4fD3VAID+Ie0AAIkZebLURURE1NIYZpyUtWUm2Ncdc4d1lo73D/EBACSxZYaIiG4TDDNO6t6oQHTycceKGX2gUVXtnH3HjTBz/FIBKk1mmaojIiJqOSq5C6CmGRvpj7GR/rWOd+ngBa2bCoVllTidXYhegToZqiMiImo5bJlpY1xcFOgX7AOAg4CJiOj2wDDTBvW/EWa43gwREd0OGGbaoH43xs0kpnNGExERtX0MM21Qv2DL9OxzV4tRUFIhczVERETNi2GmDfL1VKNzew8AQNKlfHmLISIiamYMM22UdfE8rjdDRERtHcNMGyXNaOJKwERE1MYxzLRR/aVBwPkQQshbDBERUTNimGmjeui9oVG5oKC0AmnXiuUuh4iIqNkwzLRRapUL+nSyrP7L9WaIiKgtY5hpw6xdTftTr8lbCBERUTNimGnDpkQFAgD+e+wKMq6XyFwNERFR82jVYSY6OhoKhcLmodfr5S7LafQL9sHdER1QaRb4V9w5ucshIiJqFq06zABAr169kJmZKT2Sk5PlLsmpLBoTAQDY/NslZBWUyVwNERGR47X6MKNSqaDX66VHx44d5S7JqQwO88XgMF+Um8z49z62zhARUdvT6sNMamoqAgMDERYWhkceeQTnz59v8Hqj0QiDwWDzuN0tGhMOAPjySDquFhplroaIiMixWnWYGTJkCD7//HPs3LkTH3/8MbKysjBs2DDk5ubW+5qYmBjodDrpERwc3IIVt07Dwzugb7APyirM+PRA2k2vN5sFyivNdn/u9eJyfP7rBZSUV97S6xz1+UREdHtQCCdaHra4uBhdu3bF8uXLsWzZsjqvMRqNMBqrWh8MBgOCg4NRUFAAb2/vliq11dl9MhtPfv4bPNVKHHhpDNp5quu8TgiB+V8kYM/vV/Hk3WFYNCYC7mplkz5z8ZeJ2HbsCmYPCcGKGX0a9ZqM6yV4cv1vuF5Sjl1LR9RbJxERtW0GgwE6na5R39+tumWmJk9PT/Tp0wepqan1XqPRaODt7W3zIOCenn7oGeCN4nIT1h28UO913xy9jJ0p2Sg3mfHPvecwdlUcdqZk3fKWCAUlFdiRkgUA2PxbBi7l3XxqeFJGPmb88xeczi7E1UIj/nv8yi19JhER3Z6cKswYjUacOnUKAQEBcpfidBQKhTR2Zt0vacgsKK11TW6REX/bfhIAML1fIDr5uONyfime+U8CnvgsvlGBxOq/x69IXUUVJoEP9zQ8+HjHiSw88tGvuFZUDi+NCgCw5ejlRn8eERHdvlp1mHnhhRcQFxeHtLQ0HD58GA888AAMBgPmzp0rd2lOaWIvPXoGeKOwrBKPf3oE+SXlNuff+P4k8ksq0EOvxdsP9kXsshF4blRXuCoV2HP6Kh756BAMZRWN+qyvEy4BACb3sawLVF/rjBACn+w/j2c3JKCswozR3Tti++LhULookJSRj/NXi+z8qYmIqK1r1WHm0qVLmDVrFrp3746ZM2dCrVbj0KFDCA0Nlbs0p+TiosDHjw+Av7cGqTlF+MP631BabgIAxJ25iq1JV6BQAG/dHwVXpQs81Cosn9gDO5aOQLCvOy7llSL6u5Sbfs7ZnEIkZeRD6aLAX+7rjeHhloX7
Ptxztta17+w6g79tPwUhgEfvDMHHjw9EaHtP3B3RAQCwNZGtM0RE1LBWHWY2bdqEK1euoLy8HJcvX8Y333yDyMhIuctyakHtPPD5E0Pg7aZCwsU8LNx4FIayCrz6rWUxwnnDOqNfsI/Na7p29MLqh/vBRQFsSbyMbccaHsvydYIlgIzu3hEdtRosHVu1cF/1bRXW/JyKNTcCzsuTeuCNab2hUlp+S87o3wkA8G3S5Vser0NERLeXVh1mqHl012vx6bxB0Khc8NPvOZj47j5cyitFoM4Nfxzfvc7XDAj1xcIbqwm/+m0yLufXHnMDACazwLeJli6mBwZYpsUP7Owrbavwz72W8PLJ/vP4x64zAIBXJvfAMyO7QqFQSO8zPlIPT7USGddLkXAxzzE/OBERtUkMM7epQZ19sWb2HVC6KHDlxjYHf5vRWxp8W5fFY8LRP8QHhWWVeP6rJJjMtVtM9qVeRbbBiHYerhjTw086vuSeqtaZf+w8jb9tPwUAWDauG54e0bXW+7irlZjY2zLQewu7moiIqAEMM7excZH+iJnZB2qVC2YNDsaYHv4NXq9SumD1w/3gqVbiSNr1OrdHsA78ndavE9Sqqt9e1VtnrF1Lz47qKs2wqsvMOyxdTduPZ8JYabrlny+/pBzHMvJxKtOAC9eKkVVQhoKSCpjrCGFtjRACa35Oxf98dwIVJi5ASERtW/3/DKfbwkMDg3FvVAA81I37rRDa3hOv39cLy78+jlW7zsBNpcSjd4ZCrXJBQUkFYlOyAQAPDAiq9dol90Rgf+o1AMD/u6szlk/obtO1VNOdXdpD7+2GLEMZ9vx+FRN7W2ZGCSGwOeESfj2XC19PNfy9NfD3dkMHLw0u5pbgaHoejqbn4fzV4jrf11WpQKCPO4LauSPIxwOdO3jiwYFB6OCladQ9cAb/3lfVjdfJxx3PjKzd+kVE1FY41QrATXErKwhS4wghsHhTEv57YyBwWAdP/GlSD+QYyvDadynoodfixyV31xlU1h+8gPJKM568O6zBIGMV8+Mp/DvuPCb08se/HxuIbEMZln99HHFnrjaqVn9vDUxmoKzChJLyStTXKOPtpsKLE7pj9pBQKF1uXldr9kNyJp7bcFR67ubqgtjnRyLY10PGqoioORzLyMeOlCzcf0cnhPtp5S7HoW7l+5thhpqk0mTG//12CatiT+NakWW9GrXSBeUmM167NxJ/GB7mkM85nVWICav3wVWpwF+n9cbfd/yO/JIKqFUu+H/DOkMAyDaUIdtQhpxCI/TebrgjpB3uCPVB/+B2NtshCCFQbjLjaqERl/JKcTmvFJfySrEzJQsnMy0bkvbu5I2/TuuNO0LaOaT+lpaYnodHPjoEY6UZc4eG4nR2IQ6dv45R3Tti3bxBjQqQt4vMglK8sPkY3F2VeOehftC5uzrsvc1mgT2nc9DNX8sQSQ4nhMDe01fx733ncOj8dQBAoM4N2xff3aa2gGGYqYZhpnkVGSuxdu9ZfLI/DcZKM1QuChx65R6HdtlMfm+/FDYAS+B496F+iPB3zL9CTGaBDYcv4u2dp1FYZtkUc/aQELw2JbLJ+1LVJz23BJ4aJdo3Q5dWxvUSzPjnL7hWVI57evjho8cH4kJuMSat3o9ykxlrZvfHvVGBDb5HjqEMK3eeRg+9Fo8P7Wwz7sleBaUViD2ZjVHdO8repZdypQBPfBaPbINlH7c+nXT4/InBjfoiyLhego/2ncd9/QIxqLNvndesij2D939KhVajwr8eG4C7wjvUeV1ieh78vd0Q6OPe9B+GbhuFZRXYfjwT//tLGs5kWxYUVbko4O3uiuvF5RjZzfKPFhcnb122YpiphmGmZVzOL8Vnv6Shh94b99cxXsYen+w/j79tPwWliwILRnXFwjERDv2StbpWZMRbP/4uDWLu7q/Fh3PuQLifV61rrxeX42JuMVyVLnBzdYFaqYS7WomO2rq/pIUQ+Ffcefx9x+8AgAg/LwwO88WQ
Lu1xZ5gv/Lzd7Kr9enE5Hv73r0jNKUJkgDc2zx8Kzxsz01bvPoPVu1PRUavB7mUj622BOJtTiLn/Gy9Nu+/a0RN/nda73i/i6sorzcg2lCGonXudrT+/ZxnwzH8ScDG3BB28NFj9cD8Mj6j9vsZKEw6kXoMQgI+HK3w8XKFzV6Odh6u0BpG99pzOwcINR1FcbkLXjp7IL6lAbnE5eui1+OLJIQ0GrdTsQjz66WFkG4xwc3XBhifvxIBQ21a8XSlZePo/CdJzlYsCb90fZTOOLKugDK9vO4GdKdnwVCvx/qz+uKdnwwPw65NfUo7lXx/H1SIjFo4Ox5gefq2qBU4IgaSMfHydcAml5SaM7+WPUd394OZa+x8K+SXluFpohNbNFTp3V7i5urSqn6UuFSYzth/PREh7D/QP9qmz3tJyE+LO5CDCX4uuHWv/fdIQk1lgf+pVbDl6GTtTsmC8sU2Ml0aF2UNC8P/u6oyC0gpM//AXlFWY8cdx3bDoxuxRZ8cwUw3DjPOrNJnxZXwG+gX5oE+Qrtk/7+DZa1i8KQnXiozwUCsRM7MPpvWzzKxKTM/D579exPbjmSivY5bQkDBf/G16b5tWowqTGf/z3Ql8eSSj3s8cENoOU6MCMDkqAH5aS7AxVppw/FIBDp/PRcb1UoyN9MeYHn42Y3rMZoHNCRl468ffkVdSAb23G7YuuAt6XVU4MlaaMGn1fpy/Vow5Q0LwZh07mB8+n4unPv8NhrJKhPh6oNhYidxiS/fhvVEB+POUSJv3tErNLsRX8RnYkngZ14vL0TdIhxcn9MBd4e2lv9S/P34FL24+jtIKE1wUgFkACgXw7MiueH5cN7gqXVBWYcJX8RlYu/ccsgxltT7HU63Eff0C8cigEEQF6Zr8BbfxcDpe++4ETGaBYV3bY+2jA5BjKMOcTw4jp9CIrh09sfGpO+FfR7g8lpGPeeuOIK+kAq5KBSpMAjp3V2yePxTdbvx6n80pwvQPf0GRsRKP3hmCgtJKaWzZknsisPieCGw4fBErd5xGkbFSem+FwrJw5FN3d7mlny3tWjH+8Fk8zl+rGuw+ILQdXprYA4PD6m41skd5pRkrd/yO749n4tE7Q/DUiC7QqOpuvSworcDWxMv48kg6fs8qtDnnpVFhfKQ/xvfyx9WiciSm5yEpPd/m5wAsXdc+Hq6Yd1dnPDeq/pmPQgiHhB6TWSA1pxDHMwpw7FI+jl8qQPr1Ekzta/kzUDOAFZRWYOHGo9LEhh56LWYNDsH0/p2gc3fFicsF2BSfju8Sr6DQWAkvjQrrnxhcKwDXV8vGI+n44KdU5BQapePhfl54aGAQHhkcAm+3qn+YbP4tAy9+fRwKBfDFH4Y06h8hNVlnPrrewj8chBDYmZKN0T061vt7oakYZqphmKGmyCksw5Ivk/Dr+VwAwNS+gUjPLcaxSwXSNXpvN5hvjMMxVphRVmmCEJZ/iT89ogsWjYlApdmM5zZY/rJzUQD/c28kpvXrhCMXruNI2nUcTstFyhUDrH8KFQpLIHJRKHA0PQ9lFbaBKbS9B+YO7YwHBwbhYm4JXvvuBBLT8wFYWpLen9Uf3fW1u99+PZeLWR8fAgD8dVov3BHSDuF+XnBzVeK/x67gj/93DOUmM+4I8cEncwdB6aLAql2n8Z9DF2G+8TN1aueOTj7uCPRxh7+3BofOX693QcOhXdrjj+O7IfZkNv697zwAYHh4B6x8IApr9pzFxsPpACxfvOMj/fHpgTTpL2w/rQYBOjfkl1Ygv6QChrIKVP9bqmeAN2YNDsa9UYHwbeT4gNTsQqz+KRXbj2cCAO6/I0halgCwhILZHx9CZkEZOrf3wEsTe2BQmK/USvPruVw8uT4exeUm9A3S4Z+PDsDCjUeRmJ4Pvbcbvn52KHTurpj24S84f7UYgzv7YsNTQ6BUKPD2rtNYu9eyjIG/t0bq2uof4oM3pvXGxiPp0v14cEAQ/jajd6O+FA6dz8X8LxKQX1KB
QJ0bJvYOwMYjF6XfM6O6d8SC0eEYGNqu0V/0xkoT8ksq4KfV1HrNxdxiLPoyEcer/Rno0sETf5nWC3dHdARgCdeHzudic8Il/JCcKbUiaFQumNInAO291Nh+PFNa26ou3m4qFJebaq1j9ecpPfHk3V1qXZ+UkY+FG48irIMn3n6gb52h+2ZyDGXSr0P14FBdD72lpdbaspKeW4In1sfjbE4R3FxdIARsft5gXw+czanaW87N1QVlFWZ4aVT4zx8Go38D4/JOXC7Aq1tP4FhGPgCgnYcr7usbiPsHBKFPp/rD/EtfH8dXv2Wgg5ca3y+6G3qdG4QQyC0uR1ZBGTp38KxzLbHSchP+ve8c/hV3DmYB9NRr0auTDn066dA3yAc9A7R1fuaJywX4639P4siF63hpYg88O8qxsyYZZqphmKGmMpkF3tt9Bh/sOSt9maqVLri3bwDmDeuMqCAfm+sv5ZUgelsKdp/KAQAEtXOHh1qJM9lFcHdV4oNZ/TE2snZXQrahDNuPZ+K/x69IwcSqvacag8N84afVYGvSFRSUWjb69FQrUVphgllY/v/5cd0wd1jnBv9F9eLmY9h8owsNsASn4HYeSL+xxcSEXv5475H+Nv/6PHG5AP/z3QkcrVGXldJFgTE9/PDIoGD07qTDv+LOYcOh9FqtVs+M7IIXx3eXuoq2H8/En745jsJqrROBOjc8OzocDw4IsqnBZBaIv3Adm46k44cTWdJu7AoF0DtQhxHdOmBEREfcEdqu1s9/JrsQ7/+Uiu3JmdKv4dKxEVhyT0Stv5wzrpdg9ieHkHG9anXrLh080TfYB9uTM1FeacbQLu3x8dyB8NKokF9Sjgf/Zena69LBE507eOLn33Og93bDfxcNt+lyrN4i5KVR4aWJVTPnhBBYf/AC/vr9SZiFJeTc08MPOg81fNwtXW1eGhXc1Uq4u1oee89cxavfJqPCJNA32AcfPz4Aflo3ZBvK8P5PqdgUnyGFgR56LeYMsbQWaN3q7mIsLKvAF4fS8emBNFwrMqKbvxem9euE+/oGItjXA98fv4I/fZOMImOlpaVkWGd8cSgd14osX/xT+gQgwt8LXydcwqW8qvvXQ6/FI4OCMaN/EHQels82mwWOpufh++OZ2J96FYE+7ugf7IP+oe3QL8gH7TzVEEKguNyEgtIKfJNwCatiLcsMrH64H6bf2OoEAPaduYr5XySg5MYec76earz7cD+M7Naxzp+zOiEEEi7mYf2vF/FjciYqb9wvT7USfYIsX+JRQT5QugCvfnsCucXl8FQrsWJmH3TyccfT/0nA9eJy6L3d8MncgQhu54GtSbYtUWqlCyb01uORQcHoF+yDJz6Lx+G069BqVPjPk0NqbR1TZKzEql1n8NnBNJgFoNWo8Mfx3TB7SGijutbLKkyY8c+DOJVpQLCvO9xdLauol1ZY7o+HWompUYF4ZHCw9Nk/JGdhxQ+n6l3VHbDMWJ3RvxNm9O+EYF8PXCsy4p1dp7EpPgNCWIJafQug2oNhphqGGbLXvjNX8fH+8xgS5otHBofcdPDqrpQsRG9Lkf712VGrwf/OHdSoLrKM6yWIPZkNV5UL7gzzRbifl/SlW1JeiS1HL2PdL2k4d2MNnYa6gGoqMlZizc9ncTQ9D6nZhcgrqdoBfd6wznjt3sg6p6ULIXA53zL7y/rfKwVlCG3vgZn9O9Ua73M5vxTv7T6DrxMuwc1ViZUPRNU58Djjegle2HwMVwuNePLuLrh/QKebtkjkl5Tj28TL+DrhElKuGGzOuSoV8HZzhdZNBa2bK1yVCiRm5EshZmIvPRbfE4HIwPr/Hsg2lGHt3nM4dD63VtfIuEh/fDDLNuxlFpTigbW/Sl8EaqUL/m/+0FpfUoCldWdf6lXMHdq5zl+vfWeuYsHGo9Ig9MaY0icA7zzUt1b3R9q1Yqzdexbbjl2RWmo81EpM6KVH146eCPb1QIivB9p5qLE5IQOf/3qx3s8N9/OSWhgG
hrbD+7P6I9DHHYayCqzadQaf/3rBZskDrUaFqf0C8eCAIPSrZwzJrRBC4K/fn8S6Xy5A5aLAp/MGYWS3jvgu6TJe2HwMFSaBu8LbI6+4AiczDVAogAWjwrF0bES946xyDGV45dsT2H0qWzo2MLQdHh/WGRN76WsFh2xDGZZsSpRmDildFDCZBfp00uGTuQNtuiWFEEjMyEfG9RKMiOhoM6i8pLwS89bF40jadWjdVPjiD0Pg5abCL2ev4UDqNfx6LlcK+FP7BuK1KT1veTzdhWvFmPrBAZt/KCgUll8XQ7Vf4x56LbRuKsRfsLSudvJxxyuTe6J3J28kXy5A8uUCnLhcgKMX86UwBFhaU89kFUrvf1/fQPxpUo9mGcTOMFMNwwzJodhYiQ/3nMW5q0X4n6m90MmBf9DNZoFDabnQqJSN6nuvixAC14rKkZpdCHe10iFfOjVdyS+FykVh9+Dm+uQYyrA/9Rr2pV7F/tRruH5jjE9Nk3pbQkzPgFv7819QUoHfLl7HkQvX4alW4blRXev8cjx3tQgP/utXXC8ux9/v74OHB4U06ecBLF9E//dbBnKLypFfWo78EktXW3F5JcoqTCgtN6G0wgSV0gXzR3TB0rHdGpy5UlBSgW+OXsKGwxelAFyfrh098ewoywDi3aey8V3SZRw8lwtxY4xTfQHh5BUD3t19BuWVZszo3wkTeukdPgvQbBZY+lUSth27Ag+1EnOGhOCTA2kQwhLoVz3UD2Yh8Mb3J7HhRpfd4DBfLBgdjqFd2kvhRAiBbxMvI3pbCgxllXBVKjCzfxAeGxqK3p0a/seGySzw3k+p+ODnVAhhCcerHu7b6AVHrYqNlZi37gjiL+RBoQBqfgOHdfDEX6t13TXFqUwDkjLyEejjjuB27ujUzh1qpQuOpF3HV/EZ2F6jC3D+yK6YP7Jrnb9uxcZK7DiRhS2Jl6TfD4BlVunrU3vVO6PPERhmqmGYIWr7zGaBLEMZDGUVKCyrROGN//YM8JYG5zanq4VGXM4vrbNFxtGEEDAL3NLijkIIHE67jkM3BpNn5JUg43oJsg1l6BPkg2dHdsX4SP9awSjHUIZdJ7PRzV/bLAOKb0V5pRlPfBaPA2evScfmDg3F61N72dT9XdJlvLIlGcU3up60biqM7WkZPL818TJ++t3SDdynkw5vPxiFHvpb+15IuJiHjOsluK9vYJOnQBcZKzHvf4/gt4t5UKtcMKhzO9wV3gHDwzugV6Cu2RfuLCipwHfHLuNqoREPDwpGULvGrYWUWVCKnSey0N5Lg8l9Apq9ToaZahhmiIjqZjYLp1qTpMhYidkfH8LxSwVYNq4bFo0Jr7NFMe1aMT7efx67UrKlcT1WaqULloyNwDMjujhsun9TGCtNOJVZiB56bZ3T1IlhxgbDDBFR21FhMiOn0NiorlvTjcHGO05kYc/pHATq3PE/UyNbpLWO7McwUw3DDBERkfO5le9v+drYiIiIiByAYYaIiIicGsMMEREROTWGGSIiInJqDDNERETk1BhmiIiIyKkxzBAREZFTY5ghIiIip8YwQ0RERE6NYYaIiIicGsMMEREROTWGGSIiInJqDDNERETk1BhmiIiIyKmp5C6guQkhAFi2EiciIiLnYP3etn6PN6TNh5nCwkIAQHBwsMyVEBER0a0qLCyETqdr8BqFaEzkcWJmsxlXrlyBVquFQqFw6HsbDAYEBwcjIyMD3t7eDn1vssV73XJ4r1sO73XL4b1uOY6610IIFBYWIjAwEC4uDY+KafMtMy4uLggKCmrWz/D29uYfjhbCe91yeK9bDu91y+G9bjmOuNc3a5Gx4gBgIiIicmoMM0REROTUGGbsoNFo8Prrr0Oj0chdSpvHe91yeK9bDu91y+G9bjly3Os2PwCYiIiI2ja2zBAREZFTY5ghIiIip8YwQ0RERE6NYYaIiIicGsNME/3zn/9EWFgY3NzcMGDAAOzfv1/ukpxeTEwMBg0aBK1WCz8/
P0yfPh2nT5+2uUYIgejoaAQGBsLd3R2jRo1CSkqKTBW3HTExMVAoFFi6dKl0jPfacS5fvoxHH30U7du3h4eHB/r164eEhATpPO+1Y1RWVuLPf/4zwsLC4O7uji5duuCvf/0rzGazdA3vddPs27cPU6dORWBgIBQKBbZu3WpzvjH31Wg0YtGiRejQoQM8PT1x33334dKlS44pUNAt27Rpk3B1dRUff/yxOHnypFiyZInw9PQUFy9elLs0pzZhwgSxbt06ceLECZGUlCSmTJkiQkJCRFFRkXTNW2+9JbRarfjmm29EcnKyePjhh0VAQIAwGAwyVu7cjhw5Ijp37iyioqLEkiVLpOO8145x/fp1ERoaKubNmycOHz4s0tLSxO7du8XZs2ela3ivHeNvf/ubaN++vfj+++9FWlqa2Lx5s/Dy8hKrV6+WruG9bpoffvhBvPrqq+Kbb74RAMS3335rc74x93X+/PmiU6dOIjY2Vhw9elSMHj1a9O3bV1RWVtpdH8NMEwwePFjMnz/f5liPHj3En/70J5kqaptycnIEABEXFyeEEMJsNgu9Xi/eeust6ZqysjKh0+nEv/71L7nKdGqFhYUiIiJCxMbGipEjR0phhvfacV566SUxfPjwes/zXjvOlClTxBNPPGFzbObMmeLRRx8VQvBeO0rNMNOY+5qfny9cXV3Fpk2bpGsuX74sXFxcxI4dO+yuid1Mt6i8vBwJCQkYP368zfHx48fj4MGDMlXVNhUUFAAAfH19AQBpaWnIysqyufcajQYjR47kvW+iBQsWYMqUKRg7dqzNcd5rx9m2bRsGDhyIBx98EH5+fujfvz8+/vhj6TzvteMMHz4cP/30E86cOQMAOHbsGA4cOIDJkycD4L1uLo25rwkJCaioqLC5JjAwEL1793bIvW/zG0062rVr12AymeDv729z3N/fH1lZWTJV1fYIIbBs2TIMHz4cvXv3BgDp/tZ17y9evNjiNTq7TZs24ejRo4iPj691jvfacc6fP4+1a9di2bJleOWVV3DkyBEsXrwYGo0Gjz/+OO+1A7300ksoKChAjx49oFQqYTKZ8Oabb2LWrFkA+Pu6uTTmvmZlZUGtVqNdu3a1rnHEdyfDTBMpFAqb50KIWseo6RYuXIjjx4/jwIEDtc7x3tsvIyMDS5Yswa5du+Dm5lbvdbzX9jObzRg4cCBWrFgBAOjfvz9SUlKwdu1aPP7449J1vNf2++qrr/DFF19g48aN6NWrF5KSkrB06VIEBgZi7ty50nW8182jKffVUfee3Uy3qEOHDlAqlbWSZE5OTq1USk2zaNEibNu2DXv27EFQUJB0XK/XAwDvvQMkJCQgJycHAwYMgEqlgkqlQlxcHN5//32oVCrpfvJe2y8gIACRkZE2x3r27In09HQA/H3tSC+++CL+9Kc/4ZFHHkGfPn3w2GOP4fnnn0dMTAwA3uvm0pj7qtfrUV5ejry8vHqvsQfDzC1Sq9UYMGAAYmNjbY7HxsZi2LBhMlXVNgghsHDhQmzZsgU///wzwsLCbM6HhYVBr9fb3Pvy8nLExcXx3t+ie+65B8nJyUhKSpIeAwcOxJw5c5CUlIQuXbrwXjvIXXfdVWuJgTNnziA0NBQAf187UklJCVxcbL/WlEqlNDWb97p5NOa+DhgwAK6urjbXZGZm4sSJE46593YPIb4NWadmf/rpp+LkyZNi6dKlwtPTU1y4cEHu0pzas88+K3Q6ndi7d6/IzMyUHiUlJdI1b731ltDpdGLLli0iOTlZzJo1i9MqHaT6bCYheK8d5ciRI0KlUok333xTpKamig0bNggPDw/xxRdfSNfwXjvG3LlzRadOnaSp2Vu2bBEdOnQQy5cvl67hvW6awsJCkZiYKBITEwUAsWrVKpGYmCgtSdKY+zp//nwRFBQkdu/eLY4ePSrGjBnDqdly+/DDD0VoaKhQq9XijjvukKYPU9MBqPOxbt066Rqz2Sxef/11odfrhUajESNG
jBDJycnyFd2G1AwzvNeO89///lf07t1baDQa0aNHD/HRRx/ZnOe9dgyDwSCWLFkiQkJChJubm+jSpYt49dVXhdFolK7hvW6aPXv21Pn389y5c4UQjbuvpaWlYuHChcLX11e4u7uLe++9V6SnpzukPoUQQtjfvkNEREQkD46ZISIiIqfGMENEREROjWGGiIiInBrDDBERETk1hhkiIiJyagwzRERE5NQYZoiIiMipMcwQERGRU2OYISKH6dy5M1avXt3o6/fu3QuFQoH8/Pxmq6k1udX7Q0SNo5K7ACKSz6hRo9CvXz+HfcHGx8fD09Oz0dcPGzYMmZmZ0Ol0Dvl8Iro9McwQUYOEEDCZTFCpbv7XRceOHW/pvdVqNfR6fVNLIyICwG4motvWvHnzEBcXh/feew8KhQIKhQIXLlyQun527tyJgQMHQqPRYP/+/Th37hymTZsGf39/eHl5YdCgQdi9e7fNe9bsRlEoFPjkk08wY8YMeHh4ICIiAtu2bZPO1+xm+uyzz+Dj44OdO3eiZ8+e8PLywsSJE5GZmSm9prKyEosXL4aPjw/at2+Pl156CXPnzsX06dMb/HkPHjyIESNGwN3dHcHBwVi8eDGKi4ttan/jjTcwe/ZseHl5ITAwEB988IHNe6Snp2PatGnw8vKCt7c3HnroIWRnZ9tcs23bNgwcOBBubm7o0KEDZs6caXO+pKQETzzxBLRaLUJCQvDRRx81WDcR3RzDDNFt6r333sPQoUPx1FNPITMzE5mZmQgODpbOL1++HDExMTh16hSioqJQVFSEyZMnY/fu3UhMTMSECRMwdepUpKenN/g5f/nLX/DQQw/h+PHjmDx5MubMmYPr16/Xe31JSQn+8Y9/4D//+Q/27duH9PR0vPDCC9L5v//979iwYQPWrVuHX375BQaDAVu3bm2whuTkZEyYMAEzZ87E8ePH8dVXX+HAgQNYuHChzXVvv/02oqKicPToUbz88st4/vnnERsbC8DSQjV9+nRcv34dcXFxiI2Nxblz5/Dwww9Lr9++fTtmzpyJKVOmIDExET/99BMGDhxo8xnvvPMOBg4ciMTERDz33HN49tln8fvvvzdYPxHdhEP23iYipzRy5EixZMkSm2N79uwRAMTWrVtv+vrIyEjxwQcfSM9DQ0PFu+++Kz0HIP785z9Lz4uKioRCoRA//vijzWfl5eUJIYRYt26dACDOnj0rvebDDz8U/v7+0nN/f3/x9ttvS88rKytFSEiImDZtWr11PvbYY+Lpp5+2ObZ//37h4uIiSktLpdonTpxoc83DDz8sJk2aJIQQYteuXUKpVIr09HTpfEpKigAgjhw5IoQQYujQoWLOnDn11hEaGioeffRR6bnZbBZ+fn5i7dq19b6GiG6OLTNEVKeaLQrFxcVYvnw5IiMj4ePjAy8vL/z+++83bZmJioqS/t/T0xNarRY5OTn1Xu/h4YGuXbtKzwMCAqTrCwoKkJ2djcGDB0vnlUolBgwY0GANCQkJ+Oyzz+Dl5SU9JkyYALPZjLS0NOm6oUOH2rxu6NChOHXqFADg1KlTCA4Otmm9st4L6zVJSUm45557Gqyl+v1QKBTQ6/UN3g8iujkOACaiOtWclfTiiy9i586d+Mc//oHw8HC4u7vjgQceQHl5eYPv4+rqavNcoVDAbDbf0vVCiFrHqqt5viaz2YxnnnkGixcvrnUuJCSkwddaP0sIUetzax53d3dv8L2AW78fRHRzbJkhuo2p1WqYTKZGXbt//37MmzcPM2bMQJ8+faDX63HhwoXmLbAGnU4Hf39/HDlyRDpmMpmQmJjY4OvuuOMOpKSkIDw8vNZDrVZL1x06dMjmdYcOHUKPHj0AWFph0tPTkZGRIZ0/efIkCgoK0LNnTwCWVpeffvrJ7p+TiG4NW2aIbmOdO3fG4cOHceHCBXh5ecHX17fea8PDw7FlyxZMnToVCoUCr732miwtCosWLUJMTAzCw8PRo0cPfPDBB8jL
y6uz1cTqpZdewp133okFCxbgqaeegqenJ06dOoXY2FibGUu//PILVq5cienTpyM2NhabN2/G9u3bAQBjx45FVFQU5syZg9WrV6OyshLPPfccRo4cKXXJvf7667jnnnvQtWtXPPLII6isrMSPP/6I5cuXN+9NIbrNsWWG6Db2wgsvQKlUIjIyEh07dmxw/Mu7776Ldu3aYdiwYZg6dSomTJiAO+64owWrtXjppZcwa9YsPP744xg6dKg0/sXNza3e10RFRSEuLg6pqam4++670b9/f7z22msICAiwue6Pf/wjEhIS0L9/f7zxxht45513MGHCBACW7qCtW7eiXbt2GDFiBMaOHYsuXbrgq6++kl4/atQobN68Gdu2bUO/fv0wZswYHD58uHluBBFJFOJmnc1ERK2Y2WxGz5498dBDD+GNN95o8vt07twZS5cuxdKlSx1XHBG1CHYzEZFTuXjxInbt2oWRI0fCaDRizZo1SEtLw+zZs+UujYhkwm4mInIqLi4u+OyzzzBo0CDcddddSE5Oxu7du6VBuER0+2E3ExERETk1tswQERGRU2OYISIiIqfGMENEREROjWGGiIiInBrDDBERETk1hhkiIiJyagwzRERE5NQYZoiIiMip/X8rREz6aNGI+AAAAABJRU5ErkJggg==",
      "text/plain": [
       "<Figure size 640x480 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Prepare the word-frequency distribution used for negative sampling in contrastive learning\n",
    "# Vocabulary size = number of entries in the dataset's token -> id mapping\n",
    "vocab_size = len(dataset.token2id)\n",
    "embed_size = 128\n",
    "distribution = dataset.get_word_distribution()\n",
    "print(distribution)\n",
    "# NOTE: the first code cell of this notebook bound `model` to the gensim KeyedVectors;\n",
    "# from here on `model` refers to the SkipGramNCE instance instead.\n",
    "# SkipGramNCE is defined in an earlier cell; presumably its arguments are\n",
    "# (vocabulary size, embedding dimension, noise distribution) - TODO confirm.\n",
    "model = SkipGramNCE(vocab_size, embed_size, distribution)\n",
    "\n",
    "from torch.utils.data import DataLoader\n",
    "from torch.optim import SGD, Adam\n",
    "\n",
    "# Batch-collation helper: converts a list of samples into the tensors PyTorch expects.\n",
    "# collate_batch is declared as a classmethod (not a staticmethod) although it never uses cls.\n",
    "class DataCollator:\n",
    "    \"\"\"Collates a batch of samples into a dict of torch.long tensors.\"\"\"\n",
    "\n",
    "    @classmethod\n",
    "    def collate_batch(cls, batch):\n",
    "        \"\"\"Stack `batch` into a 2-D array and split off its first two columns.\n",
    "\n",
    "        Returns a dict whose 'input_ids' entry holds column 0 and whose\n",
    "        'labels' entry holds column 1, both as torch.long tensors.\n",
    "        \"\"\"\n",
    "        pairs = np.array(batch)\n",
    "        columns = {'input_ids': pairs[:, 0], 'labels': pairs[:, 1]}\n",
    "        return {key: torch.tensor(col, dtype=torch.long)\n",
    "                for key, col in columns.items()}\n",
    "\n",
    "# Training hyperparameters and the objects the training loop below relies on\n",
    "epochs = 100\n",
    "batch_size = 128\n",
    "learning_rate = 1e-3\n",
    "# Filled by the training loop with one loss value per epoch\n",
    "epoch_loss = []\n",
    "\n",
    "data_collator = DataCollator()\n",
    "# `data` comes from an earlier cell; presumably a sequence of (input id, label id)\n",
    "# pairs, since collate_batch reads columns 0 and 1 of each batch - TODO confirm.\n",
    "dataloader = DataLoader(data, batch_size=batch_size, shuffle=True,\\\n",
    "    collate_fn=data_collator.collate_batch)\n",
    "optimizer = Adam(model.parameters(), lr=learning_rate)\n",
    "# Clear any existing gradients and switch the model to training mode before the loop\n",
    "model.zero_grad()\n",
    "model.train()\n",
    "\n",
    "# 需要提前安装tqdm\n",
    "from tqdm import trange\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "# Training loop: for every batch, run the model to obtain the loss, backpropagate,\n",
    "# and let the optimizer update the parameters.\n",
    "with trange(epochs, desc='epoch', ncols=60) as pbar:\n",
    "    for epoch in pbar:\n",
    "        # Collect every batch loss so the epoch is summarised by the mean over its\n",
    "        # batches, not by whichever (shuffled) batch happened to come last.\n",
    "        batch_losses = []\n",
    "        for step, batch in enumerate(dataloader):\n",
    "            loss = model(**batch)\n",
    "            # Read the scalar once and reuse it for both the progress bar and the log\n",
    "            loss_value = loss.item()\n",
    "            batch_losses.append(loss_value)\n",
    "            pbar.set_description(f'epoch-{epoch}, loss={loss_value:.4f}')\n",
    "            loss.backward()\n",
    "            optimizer.step()\n",
    "            model.zero_grad()\n",
    "        # Fail with a clear message instead of a NameError on an undefined `loss`\n",
    "        if not batch_losses:\n",
    "            raise RuntimeError('dataloader produced no batches; nothing to train on')\n",
    "        epoch_loss.append(sum(batch_losses) / len(batch_losses))\n",
    "    \n",
    "# Plot the mean training loss of each epoch\n",
    "epoch_loss = np.array(epoch_loss)\n",
    "plt.plot(range(len(epoch_loss)), epoch_loss)\n",
    "plt.xlabel('training epoch')\n",
    "plt.ylabel('loss')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c9430e9a",
   "metadata": {},
   "source": [
    "### TF-IDF加权\n",
    "\n",
    "定义词频率（term frequency）。注意到不同长度的文章词频率会有较大差距，不利于比较和运算，因此可以对词频率取对数。\n",
    "\n",
    "$$\\text{tf}_{t,d} = \\log (\\text{count}(t,d) + 1)$$\n",
    "\n",
    "其中$\\text{count}(t,d)$表示词$t$在文档$d$中出现的次数，为了避免对0取对数，把所有的计数加1。\n",
    "\n",
    "那么如何区分高频词与低频词呢？TF-IDF引入了另一个重要的评价指标——文档频率（document frequency），即一个词在语料库所包含的多少篇文档中出现。在所有文档里出现的词往往是虚词或是常见实词，而只在少量文档里出现的词往往是具有明确含义的实词并且具有很强的文档区分度。用$\\text{df}_t$来表示在多少篇文档中出现了词$t$。\n",
    "\n",
    "为了压低高频词和提升低频词的影响，TF-IDF使用文档频率的倒数，也就是逆向文档频率（inverse document frequency）来对词频率进行加权。这很好理解，一个词的文档频率越高，其倒数就越小，权重就越小。\n",
    "\n",
    "$$\\text{idf}_t = \\log \\frac{N}{\\text{df}_t}$$\n",
    "\n",
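    "其中$N$表示文档总数。为了避免分母为0，通常会将分母改为$\\text{df}_t+1$。下文的代码实现在启用平滑（`smooth_idf=True`）时，会将$N$和$\\text{df}_t$同时加1，并在取对数之后再加1，即$\\text{idf}_t = \\log \\frac{N+1}{\\text{df}_t+1} + 1$，这样即使一个词在所有文档中都出现，其权重也不会变为0。\n",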
    "\n",
    "基于词频率和逆向文档频率，得到TF-IDF的最终值为：\n",
    "\n",
    "$$w_{t,d} = \\text{tf}_{t,d} \\times \\text{idf}_{t}$$\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f765e353",
   "metadata": {},
   "source": [
    "很多情况下会额外对文档的TF-IDF向量使用L2归一化，使得不同文档的TF-IDF向量具有相同的模长，便于相互比较。\n",
    "下面给出了TF-IDF的代码实现。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "9ce8e610",
   "metadata": {},
   "outputs": [],
   "source": [
    "class TFIDF:\n",
    "    \"\"\"TF-IDF weighting for documents given as sequences of integer token ids.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    vocab_size : int\n",
    "        Length of the idf vector and width of the output matrix; token ids\n",
    "        are used directly as indices into it.\n",
    "    norm : 'l2', 'l1' or None\n",
    "        Row normalisation applied by transform(); a falsy value disables it.\n",
    "    smooth_idf : bool\n",
    "        If True, add one to every document frequency and to the document count.\n",
    "    sublinear_tf : bool\n",
    "        If True, replace raw counts by log(count + 1).\n",
    "    \"\"\"\n",
    "\n",
    "    _NORMS = ('l1', 'l2')\n",
    "\n",
    "    def __init__(self, vocab_size, norm='l2', smooth_idf=True,\n",
    "                 sublinear_tf=True):\n",
    "        # Any truthy norm used to be treated as L2 silently; reject unknown names early.\n",
    "        if norm and norm not in self._NORMS:\n",
    "            raise ValueError(f'unsupported norm {norm!r}; expected l1, l2 or None')\n",
    "        self.vocab_size = vocab_size\n",
    "        self.norm = norm\n",
    "        self.smooth_idf = smooth_idf\n",
    "        self.sublinear_tf = sublinear_tf\n",
    "    \n",
    "    def fit(self, X):\n",
    "        \"\"\"Compute self.idf = log(n / df) + 1 from the documents in X.\n",
    "\n",
    "        X is a sized collection of documents, each an iterable of token ids.\n",
    "        Returns self so that calls can be chained.\n",
    "        \"\"\"\n",
    "        doc_freq = np.zeros(self.vocab_size, dtype=np.float64)\n",
    "        for doc in X:\n",
    "            # set(): a document counts at most once per token\n",
    "            for token_id in set(doc):\n",
    "                doc_freq[token_id] += 1\n",
    "        doc_freq += int(self.smooth_idf)\n",
    "        n_samples = len(X) + int(self.smooth_idf)\n",
    "        # Without smoothing, an id that occurs in no document has df == 0, which\n",
    "        # would give idf = inf and NaN (0 * inf) in transform(). Clamp df to 1;\n",
    "        # with smoothing every df is already >= 1, so this changes nothing there.\n",
    "        doc_freq = np.maximum(doc_freq, 1)\n",
    "        self.idf = np.log(n_samples / doc_freq) + 1\n",
    "        return self\n",
    "    \n",
    "    def transform(self, X):\n",
    "        \"\"\"Return the (len(X), vocab_size) float64 TF-IDF matrix of X.\n",
    "\n",
    "        Requires fit() to have been called. Rows are normalised according to\n",
    "        self.norm; all-zero rows are left as zeros.\n",
    "        Raises ValueError if self.norm is not 'l1', 'l2' or falsy.\n",
    "        \"\"\"\n",
    "        assert hasattr(self, 'idf'), 'call fit() before transform()'\n",
    "        term_freq = np.zeros((len(X), self.vocab_size), dtype=np.float64)\n",
    "        for row, doc in enumerate(X):\n",
    "            for token in doc:\n",
    "                term_freq[row, token] += 1\n",
    "        if self.sublinear_tf:\n",
    "            term_freq = np.log(term_freq + 1)\n",
    "        Y = term_freq * self.idf\n",
    "        if not self.norm:\n",
    "            return Y\n",
    "        if self.norm == 'l2':\n",
    "            row_norm = np.sqrt((Y ** 2).sum(axis=1))\n",
    "        elif self.norm == 'l1':\n",
    "            row_norm = np.abs(Y).sum(axis=1)\n",
    "        else:\n",
    "            raise ValueError(f'unsupported norm {self.norm!r}; expected l1, l2 or None')\n",
    "        # Avoid 0 / 0 for documents without any token\n",
    "        row_norm[row_norm == 0] = 1\n",
    "        Y /= row_norm[:, None]\n",
    "        return Y\n",
    "    \n",
    "    def fit_transform(self, X):\n",
    "        \"\"\"Fit the idf weights on X, then return the TF-IDF matrix of X.\"\"\"\n",
    "        self.fit(X)\n",
    "        return self.transform(X)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
